The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
The purpose is to classify a given silhouette as one of four types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
● All the features are geometric features extracted from the silhouette. ● All are numeric in nature.
● Exploratory Data Analysis ● Reduce the number of dimensions in the dataset with minimal information loss ● Train a model using Principal Components
Apply a dimensionality-reduction technique – PCA – and train a model using the principal components instead of training the model on just the raw data.
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn import svm
# Load the vehicle-silhouette dataset and take a first look at it.
Data = pd.read_csv("vehicle-1.csv")
Data.head()
# Summary statistics, one row per feature. Note the () — without it the
# expression evaluates to the bound method object itself rather than the
# transposed summary DataFrame.
Data.describe().transpose()
Data.columns
Data.info()
Data.shape
# Per-column medians — these are the values used for imputation below.
Data.median()
# Impute missing values column by column with each column's own median.
# The target column 'class' is categorical, so it is passed through
# unchanged (filling a series with itself leaves its NaNs as they were).
def _impute_median(col):
    if col.name == 'class':
        return col.fillna(col)
    return col.fillna(col.median())

Data = Data.apply(_impute_median, axis=0)
Data.info()
# Separate the predictor matrix from the target label.
X = Data.drop(labels="class", axis=1)
y = Data["class"]
X.head()

# Standardize each feature to zero mean / unit variance so that PCA and
# the SVM are not dominated by large-scale columns.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
X = scaler.transform(X)
Data.columns
The columns 'compactness' and 'pr.axis_aspect_ratio' have small covariance as well as small correlation. The column 'compactness' and the columns 'distance_circularity', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'scaled_variance' and 'scaled_variance.1' are highly correlated, with correlations above 0.75. Most other column pairs have very small covariance, as can be observed in the code output below.
# Covariance of the raw (unscaled) features, ordered by each feature's
# covariance with 'compactness'.
cv = Data.cov()
sorted_cv = cv.sort_values(by='compactness', axis=0, ascending=False)
sorted_cv

# Covariance matrix of the standardized features; since every column now
# has unit variance, this doubles as the correlation matrix.
covMatrix = np.cov(X, rowvar=False)
print(covMatrix)
def plot_corr(X, size=11):
    """Draw an annotated heatmap of the correlation matrix of DataFrame X.

    size sets the (square) figure size in inches. Previously this
    parameter was accepted but ignored and the figure had to be created
    by the caller; the figure is now created here so `size` takes effect.
    """
    plt.figure(figsize=(size, size))
    corr = X.corr()
    sns.heatmap(corr, annot=True)

# size=12 reproduces the original 12x12 figure that was created externally.
plot_corr(Data, size=12)
# Pairwise scatter plots of all features, KDE curves on the diagonal.
sns.pairplot(Data, diag_kind='kde')

# Histogram of every feature; the last column ('class') is excluded.
feature_cols = list(Data)[0:-1]
Data[feature_cols].hist(stacked=False, bins=100, figsize=(12, 30), layout=(14, 2));
from sklearn.model_selection import train_test_split

# 70/30 split, stratified so each class keeps its proportion in both
# halves; random_state=42 fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Class balance (percentages) in the training set ...
100 * y_train.value_counts() / y_train.shape[0]
y_train.value_counts().plot(kind='bar');
# ... and in the test set.
100 * y_test.value_counts() / y_test.shape[0]
y_test.value_counts().plot(kind='bar');
# Baseline SVM (RBF kernel) trained on all standardized raw features.
svm_classifier = svm.SVC(gamma=0.025, C=3)
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_test)
# Train accuracy followed by test accuracy.
print("Accuracy %0.2f " % svm_classifier.score(x_train, y_train))
print("Accuracy %0.2f " % svm_classifier.score(x_test, y_test))
With k-fold cross-validation (n_splits = 10), we get an accuracy of 0.97 +/- 0.02, i.e. around 97%.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline SVM on the full dataset.
scores = cross_val_score(svm_classifier, X, y, cv=10, scoring='accuracy')
print(scores)
# Mean accuracy with a ~95% interval (two standard deviations).
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
import numpy as np
from sklearn.model_selection import KFold

# Repeat cross-validation with an explicit, shuffled 10-fold splitter
# (fixed seed) and all CPU cores (n_jobs=-1).
cv = KFold(n_splits=10, random_state=1, shuffle=True)
cv
scores = cross_val_score(svm_classifier, X, y,
                         scoring='accuracy', cv=cv, n_jobs=-1)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Fit PCA keeping all 18 components to inspect how much variance each
# principal component explains.
pca = PCA(n_components=18)
pca.fit(X)
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)

# Derive the x-axis positions from the fitted model rather than a
# hard-coded range(1, 19), so the plots stay correct if n_components
# is ever changed.
component_ids = list(range(1, len(pca.explained_variance_ratio_) + 1))

# Scree plot: variance explained by each individual component.
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

# Cumulative variance explained — used to decide how many components to keep.
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Keep only the first 7 principal components and project the
# standardized features onto that basis in one step.
pca7 = PCA(n_components=7)
Xpca7 = pca7.fit_transform(X)
print(pca7.components_)
print(pca7.explained_variance_ratio_)
Xpca7
# The components are mutually orthogonal, so the pairplot should show
# no linear structure between them.
sns.pairplot(pd.DataFrame(Xpca7))
# Split the 7-component projection the same way as the raw features.
# Consistency fix: stratify=y was missing here although the raw-feature
# split above stratifies — without it the two experiments are compared
# on splits with different class proportions.
xpca_train, xpca_test, ypca_train, ypca_test = train_test_split(
    Xpca7, y, test_size=0.3, random_state=42, stratify=y)

# SVM on the reduced representation (note C=1000 here vs C=3 above).
clf_pca = svm.SVC(gamma=0.025, C=1000)
clf_pca.fit(xpca_train, ypca_train)
print("Train Accuracy %0.2f " % clf_pca.score(xpca_train, ypca_train))
print("Test Accuracy %0.2f " % clf_pca.score(xpca_test, ypca_test))

# Side-by-side comparison: raw-feature SVM vs PCA-based SVM.
print("Accuracy %0.2f " % svm_classifier.score(x_train, y_train))
print("Accuracy %0.2f " % svm_classifier.score(x_test, y_test))
print("with PCA Train Accuracy %0.2f " % clf_pca.score(xpca_train, ypca_train))
print("with PCA Test Accuracy %0.2f " % clf_pca.score(xpca_test, ypca_test))